In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime, timedelta
import plotly.io as pio
pio.renderers.default = "notebook"
df = pd.read_csv("member_role_state_data_project.csv", parse_dates=["subscribe_date", "upgrade_date"])
Part 1: Most Common Role in Each State¶
In [2]:
# Format state codes to Plotly format ("XX" capital letters, no white space)
df["state"] = df["state"].str.strip().str.upper()
# Group by state and role, count how many members for each combo
role_counts = df.groupby(["state", "member_role"]).size().reset_index(name="count")
# For each state, keep only the role with the max count
top_roles = role_counts.loc[role_counts.groupby("state")["count"].idxmax()].reset_index(drop=True)
# Add dummy rows for job roles that aren't maximums in each state with fake identifiers, so they still show up in legend
## This is a personal touch, I thought it was more helpful to see what roles were being unrepresented
dummy_states = [f"DummyState{i}" for i in range(len(top_roles["state"].unique()))]
dummy_mapping = dict(zip(top_roles["state"].unique(), dummy_states))
top_roles["dummy_state"] = top_roles["state"].map(dummy_mapping)
all_roles = df["member_role"].unique()
for dummy_state in dummy_states:
for role in all_roles:
if not ((top_roles["dummy_state"] == dummy_state) & (top_roles["member_role"] == role)).any():
top_roles = pd.concat([top_roles, pd.DataFrame({"dummy_state": [dummy_state], "member_role": [role], "count": [0]})], ignore_index=True)
# Plot
fig = px.choropleth(
top_roles,
locations="state",
locationmode="USA-states",
color="member_role",
scope="usa",
title="Most Common Job Role Among Site Members, by State",
color_discrete_sequence=px.colors.qualitative.Dark24, # Although the 24 palettes are not my favorite, they have enough colors to cover all the roles
hover_data={
"state": True,
"member_role": True,
"count": True
},
labels={
"state": "State",
"member_role": "Job Role",
"count": "Member Count"
},
height=600,
width=1000,
)
fig.show()
In [3]:
# Filter for members in the "Technology" role
tech_df = df[df["member_role"] == "Technology"].copy()
# Count total members and tech members by state
total_by_state = df["state"].value_counts()
tech_by_state = tech_df["state"].value_counts()
# Build DataFrame with index values
tech_index_df = pd.DataFrame({
"tech_pct": tech_by_state / total_by_state,
"total_members": total_by_state
})
# Calculate national % of tech roles
national_pct = len(tech_df) / len(df)
# Index: state % divided by national %
tech_index_df["tech_index"] = tech_index_df["tech_pct"] / national_pct
# Reset index for Plotly
tech_index_df = tech_index_df.reset_index().rename(columns={"index": "state"})
# Plot
fig = px.choropleth(
tech_index_df,
locations="state",
locationmode="USA-states",
color="tech_index",
scope="usa",
title="Technology Member Index, by State",
subtitle="Tech Index: How each state's share of Technology members compares to the national average (1.0 = average)",
color_continuous_scale="rdbu",
hover_data={
"state": True,
"tech_index": True,
"total_members": True
},
labels={
"state": "State",
"tech_index": "Tech Index",
"total_members": "Total Members"
},
height=600,
width=1000,
)
fig.show()
Part 2b: Product & Program Management Index¶
In [4]:
# Filter for members in the "Technology" role
PM_df = df[df["member_role"] == "Project & Program Management"].copy()
# Count PM by state
PM_by_state = PM_df["state"].value_counts()
# Build DataFrame with index values
PM_index_df = pd.DataFrame({
"PM_pct": PM_by_state / total_by_state,
"total_members": total_by_state
})
# Calculate national % of tech roles
national_pct = len(PM_df) / len(df)
# Index: state % divided by national %
PM_index_df["PM_index"] = PM_index_df["PM_pct"] / national_pct
# Reset index for Plotly
PM_index_df = PM_index_df.reset_index().rename(columns={"index": "state"})
# Plot
fig = px.choropleth(
PM_index_df,
locations="state",
locationmode="USA-states",
color="PM_index",
scope="usa",
title="Project & Program Management Member Index, by State",
subtitle="PM Index: How each state's share of Project & Program Management members compares to the national average (1.0 = average)",
color_continuous_scale="rdbu",
hover_data={
"state": True,
"PM_index": True,
"total_members": True
},
labels={
"state": "State",
"PM_index": "PM Index",
"total_members": "Total Members"
},
height=600,
width=1000,
)
fig.show()
In [5]:
# Filter dataset to only include past 6 months
six_months_ago = datetime.now() - timedelta(days=180)
df_recent = df[df["subscribe_date"] > six_months_ago]
# Eliminate members who have not upgraded
df_recent = df_recent.dropna(subset=["upgrade_date"])
# Find members who upgraded within 24 hours of subscribing
df_recent["upgraded_within_24h"] = df_recent["upgrade_date"] - df_recent["subscribe_date"] <= pd.Timedelta(hours=24)
# Count by state
upgraded_within_24h_by_state = df_recent.groupby("state")["upgraded_within_24h"].sum().sort_values(ascending=False)
# Count by role
upgraded_within_24h_by_role = df_recent.groupby("member_role")["upgraded_within_24h"].sum().sort_values(ascending=False)
#### Report and Visualize Results ####
### STATE
print("****\nSTATE")
print("State with highest rate of 24-hour upgrades (past 6 months):")
print(f"{upgraded_within_24h_by_state.index[0]} -- {upgraded_within_24h_by_state.max()} total upgrades within 24 hours")
# Convert Series to DataFrame for Plotly
upgraded_within_24h_by_state_df = upgraded_within_24h_by_state.reset_index()
upgraded_within_24h_by_state_df.columns = ["state", "upgraded_within_24h"]
# Choropleth for state
fig_state = px.choropleth(
upgraded_within_24h_by_state_df,
locations="state",
locationmode="USA-states",
color="upgraded_within_24h",
scope="usa",
title="24-hour upgrades by State (past 6 months)",
color_continuous_scale="blues",
hover_data={
"state": True,
"upgraded_within_24h": True
},
labels={
"state": "State",
"upgraded_within_24h": "Total 24-hour upgrades"
},
height=600,
width=1000,
)
fig_state.show()
### ROLE
print("****\nROLE")
print("Role with highest rate of 24-hour upgrades (past 6 months):")
print(f"{upgraded_within_24h_by_role.index[0]} -- {upgraded_within_24h_by_role.max()} total upgrades within 24 hours")
# Bar chart for role
fig_bar = px.bar(
upgraded_within_24h_by_role,
x=upgraded_within_24h_by_role.index,
y=upgraded_within_24h_by_role.values,
title="24-hour upgrades by Role (past 6 months)",
labels={
"x": "Role",
"y": "Total 24-hour upgrades"
},
color=upgraded_within_24h_by_role.values,
color_continuous_scale="blues",
color_continuous_midpoint=0,
height=600,
width=1000,
)
fig_bar.update_layout(
xaxis_title="Role",
yaxis_title="Total 24-hour upgrades",
coloraxis_showscale=False # Remove color bar for clarity
)
fig_bar.show()
**** STATE State with highest rate of 24-hour upgrades (past 6 months): CA -- 304 total upgrades within 24 hours
**** ROLE Role with highest rate of 24-hour upgrades (past 6 months): Operations & General Management -- 505 total upgrades within 24 hours